/*******************************************************************************

Name: Table 1 - Summary statistics of water system characteristics

Authors: Jonathan Baker, Lori Bennear, Sheila Olmstead

Input files:
	Primary panel: natlCWS_yr_Panel_mod-small_v2-Match.dta (main panel)
	Raw input:
		1. census data: 90census.dta
		2. election results: preselections_countyresults.dta
	Intermediate files created from other code:
		1. pwsid90census.dta
		2. pwsid_panel_pres_elections.dta

Notes: run Table6_heterogeneity.do before running this code in order to generate
	   the two intermediate files noted above.
		
*******************************************************************************/


* ========================== START START START ==========================
clear
clear matrix
set more off

* ______________________________________________________________________________
* set paths and create log file
global root "C:/Users/jbaker/Documents/Research/SDWIS/Analysis/STATA"
global input "$root/Input"
global intermediate "$root/Intermediate"
global log "$root/LogFiles"
global output "$root/Output"
global outpanel "$root/OutputPanel"
global figures "$root/Figures"

log using "$log/Table1_systemSummaryStatistics.log", replace name(systemstats)



* ============================================================================== 
* table of system characteristics

use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match", clear
keep if id2001==1

// creating population-category variables
gen pcat0 = (inrange(population_served_count,0,500)) // population category
gen pcat1 = (inrange(population_served_count,501,9999))
gen pcat2 = (inrange(population_served_count,10000,99999))
gen pcat3 = (population_served_count >= 100000)

foreach i of numlist 0/3 {
	gen pop`i' = population_served_count if pcat`i' == 1 // population for the four water system groups
	}
foreach i of numlist 1/10 {
	gen r`i' = 0
	replace r`i' = 1 if epa_reg == `i' // define epa_region indicator
	}

local Nrows = 21
mat avg = J(`Nrows',5,.) // returns Nrows x 5 matrix
set more off
local rows population pop0 pop1 pop2 pop3 T_501 T_10k T_100k r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 surface public purchaser 
* epa_reg income1 income2 income3 income4 income5 income6 income7 income8 income9 income10
local rcnt = 0
foreach r of local rows {
	local rcnt = `rcnt' + 1
	quietly summarize `r' if year == 1990
	mat avg[`rcnt',1] = r(mean)
	mat avg[`rcnt',2] = r(sd)
	mat avg[`rcnt',3] = r(min)
	mat avg[`rcnt',4] = r(max)
	mat avg[`rcnt',5] = r(N)
	}
mat rownames avg = "service population (all systems)" "service population (0-500)" ///
 "service population (501-9,999)" "service population (10k-99,999)" ///
 "service population (>=100k)" "Tpub=ind(service pop. >500)" ///
 "Tmail=ind(service pop. >= 10k)" "Tweb=ind(service pop. >= 100k)" ///
 "EPA Region 1" "EPA Region 2" "EPA Region 3" "EPA Region 4" "EPA Region 5" ///
 "EPA Region 6" "EPA Region 7" "EPA Region 8" "EPA Region 9" "EPA Region 10" ///
 "systems using surface water" "publicly owned systems" "systems purchasing water" 

putexcel set "$output\Table1_systemSummaryStatistics.xlsx", sheet(systemStats, replace) modify
putexcel A1 = matrix(avg), rownames nformat(number_d2)

* ============================================================================== 
* merge 1990 census with panel

use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
xtset pwsidNUM year

// merge census data with water violations panel
sort pwsid year
merge m:1 pwsid using "$intermediate/pwsid90census.dta"
	tab _merge
	// about 14% of counties don't match
	// keep if _merge == 3
	// drop _merge
	sort pwsidNUM year

tab year if id2001==1 // sanity check
tab year if id2001 == 1 & _merge == 3 // additional sanity check

* ==============================================================================
* create table of summary stats for demographic variables

local Nrows = 6
mat avg = J(`Nrows',5,.) // returns Nrows x 5 matrix
set more off
local rows mean_family_inc perc_white perc_black perc_hs_ed perc_some_college perc_college
local rcnt = 0
foreach r of local rows {
	local rcnt = `rcnt' + 1
	quietly summarize `r' if id2001 == 1 & year == 1990 & _merge == 3
	mat avg[`rcnt',1] = r(mean)
	mat avg[`rcnt',2] = r(sd)
	mat avg[`rcnt',3] = r(min)
	mat avg[`rcnt',4] = r(max)
	mat avg[`rcnt',5] = r(N)
	}
mat rownames avg = "Mean family income" "Percent white" "Percent black" ///
"Percent high school education" "Percent some college" "Percent college"

putexcel set "$output/Table1_systemSummaryStatistics.xlsx", sheet(systems-demogrph, replace) modify
putexcel A1 = matrix(avg), rownames nformat(number_d2)

* ==============================================================================
* create table of summary stats for demographic variables (for all counties)

use "$input/90census.dta", clear

gen fips = state_fip + county_fip
duplicates report, fips
drop state_fip county_fip

local Nrows = 6
mat avg = J(`Nrows',5,.) // returns Nrows x 5 matrix
set more off
local rows mean_family_inc perc_white perc_black perc_hs_ed perc_some_college perc_college
local rcnt = 0
foreach r of local rows {
	local rcnt = `rcnt' + 1
	quietly summarize `r'
	mat avg[`rcnt',1] = r(mean)
	mat avg[`rcnt',2] = r(sd)
	mat avg[`rcnt',3] = r(min)
	mat avg[`rcnt',4] = r(max)
	mat avg[`rcnt',5] = r(N)
	}
mat rownames avg = "Mean family income" "Percent white" "Percent black" ///
"Percent high school education" "Percent some college" "Percent college"

putexcel set "$output/Table1_systemSummaryStatistics.xlsx", sheet(nation-demogrph, replace) modify
putexcel A1 = matrix(avg), rownames nformat(number_d2)

* ============================================================================== 
* merge election results with panel

use "$outpanel/natlCWS_yr_Panel_mod-small_v2-Match.dta", clear
sort pwsid year
duplicates report pwsid year
		merge 1:1 pwsid year using "$intermediate/pwsid_panel_pres_elections.dta"
		tab year if _merge == 1
		tab _merge if id2001 == 1
		keep if _merge == 3
		sort pwsidNUM year
		isid pwsidNUM year

rename repvotestotalpercent rtp
rename demvotestotalpercent dtp
rename repvotesmajorpercent rmp
rename demvotesmajorpercent dmp

bys pwsid: egen meandwin = mean(Dwins)
duplicates drop pwsid, force


* ==============================================================================
* create table of summary stats for election variables
local Nrows = 1
mat avg = J(`Nrows',5,.) // returns Nrows x 5 matrix
set more off
local rows meandwin
local rcnt = 0
foreach r of local rows {
	local rcnt = `rcnt' + 1
	quietly summarize `r' if id2001 == 1 & _merge == 3
	mat avg[`rcnt',1] = r(mean)
	mat avg[`rcnt',2] = r(sd)
	mat avg[`rcnt',3] = r(min)
	mat avg[`rcnt',4] = r(max)
	mat avg[`rcnt',5] = r(N)
	}
mat rownames avg = "Democrat wins"

putexcel set "$output/Table1_systemSummaryStatistics.xlsx", sheet(systems-elec, replace) modify
putexcel A1 = matrix(avg), rownames nformat(number_d2)

* ==============================================================================
* create table of summary stats for election variables (for all counties)
use "$input/preselections_countyresults.dta", clear

destring(*percent), replace
gen Dwins = 0
	replace Dwins = 1 if demvotesmajorpercent > 50
	replace Dwins = 1 if demvotesmajorpercent == 50 & pluralityparty == "D" 

duplicates report, state area

bys state area: egen meandwin = mean(Dwins)
duplicates drop state area, force

local Nrows = 1
mat avg = J(`Nrows',5,.) // returns Nrows x 5 matrix
set more off
local rows meandwin
local rcnt = 0
foreach r of local rows {
	local rcnt = `rcnt' + 1
	quietly summarize `r'
	mat avg[`rcnt',1] = r(mean)
	mat avg[`rcnt',2] = r(sd)
	mat avg[`rcnt',3] = r(min)
	mat avg[`rcnt',4] = r(max)
	mat avg[`rcnt',5] = r(N)
	}
mat rownames avg = "Democrat wins"

putexcel set "$output/Table1_systemSummaryStatistics.xlsx", sheet(nation-elec, replace) modify
putexcel A1 = matrix(avg), rownames nformat(number_d2)

* ========================== DONE DONE DONE ==========================
log close systemstats
